"""
Author: Shaoyu Liu
Generate the inventor-year panel of yearly and cumulative omission counts.
original file: 09/20/2023 in ipynb
rewrite on 05/23/2024 to .py script for replication
"""

import numpy as np
import pandas as pd
import os

# Root folder that holds the input and output data files.
dir = r'/Volumes/Zihao_SSD2/PatentsView/'

# Load only the needed columns of the (patent i, patent j) panel.
df_reg = pd.read_csv(
    dir + "regdata/reg_panel.csv",
    usecols=["patent_id_i", "patent_id_j", "omission", "patent_year_i", "patent_year_j"],
)
print(df_reg["patent_id_i"].nunique(), df_reg["patent_id_j"].nunique(), len(df_reg)) # (1534975 1078407 4446057)

# For each focal patent (patent_id_j), total the omission values per year of
# patent i (patent_year_i); that year becomes the omission year.
df_omission = (
    df_reg.groupby(["patent_id_j", "patent_year_i"], as_index=False)["omission"]
    .sum()
    .rename(columns={"patent_id_j": "patent_id", "patent_year_i": "omission_year"})
)

# The pairwise panel is no longer needed; release it before the next load.
del df_reg

# Patent-inventor table; a patent can list several inventors.
inventor_cols = ["patent_id", "patent_year", "inventor_id", "firstpub_year", "gender_09_100", "race80"]
df_inventor = pd.read_csv(dir + "cleandata/g_inventor_gender_race_age.csv", usecols=inventor_cols)

# Attach every inventor of a patent to that patent's yearly omission totals.
# The inner join keeps only patents present in both tables, so df_cross is
# limited to inventors with at least one patent in the omission table.
df_cross = df_omission.merge(df_inventor, how="inner", on="patent_id")
df_cross = df_cross.sort_values(["inventor_id", "omission_year"])

# Collapse to the inventor-year level: total omissions across the inventor's patents.
df_inventor_om = df_cross.groupby(["inventor_id", "omission_year"], as_index=False)["omission"].sum()

# Running total per inventor; rows are already ordered by omission_year within
# each inventor because the groupby above sorts its keys.
df_inventor_om["cumsum_omission"] = df_inventor_om.groupby("inventor_id")["omission"].cumsum()

print("Number of inventors who ever gets omitted", df_inventor_om["inventor_id"].nunique()) # 692483

# First and last calendar year of the inventor-year panel.
PANEL_START_YEAR = 1981
PANEL_END_YEAR = 2015

# Keep only inventors that appear in df_inventor_om, with exactly one start
# year each. Taking the minimum firstpub_year (instead of drop_duplicates on
# the pair) guarantees a single row per inventor even if the input carries
# conflicting firstpub_year values, which would otherwise duplicate panel rows.
omitted_ids = set(df_inventor_om["inventor_id"])
df_inventor_year = (
    df_inventor.loc[df_inventor["inventor_id"].isin(omitted_ids), ["inventor_id", "firstpub_year"]]
    .groupby("inventor_id", as_index=False)["firstpub_year"]
    .min()
    .rename(columns={"firstpub_year": "min_year"})
)

# Build every (year, inventor) combination inside the panel window, then keep
# the years from each inventor's min_year onward. Inventors whose min_year is
# missing drop out here because the comparison is False for NaN.
df_yr = pd.DataFrame({"year": range(PANEL_START_YEAR, PANEL_END_YEAR + 1)})
df_inv_yr = df_yr.merge(df_inventor_year, how="cross")
df_inv_yr = df_inv_yr[df_inv_yr["year"] >= df_inv_yr["min_year"]]
print(len(df_inv_yr)) # 13365549

# Bring the inventor-year omission counts onto the full panel; panel years
# without a matching omission_year get NaN in both omission columns.
df_inv_yr = df_inv_yr.merge(
    df_inventor_om,
    left_on=["inventor_id", "year"],
    right_on=["inventor_id", "omission_year"],
    how="left",
)
# Sort by inventor and year so the forward-fill below runs in chronological order.
df_inv_yr = df_inv_yr.drop(columns=["omission_year"]).sort_values(["inventor_id", "year"])

# A year with no matching record means zero omissions in that year.
df_inv_yr["omission"] = df_inv_yr["omission"].fillna(0)
print("Total number of inventors: ", df_inv_yr["inventor_id"].nunique()) # 692483

# Carry each inventor's running total forward through years without a record;
# years before the inventor's first matched record get 0. The result is
# assigned back to the column (no chained in-place fillna), so it also takes
# effect when pandas Copy-on-Write is enabled.
# NOTE(review): cumsum_omission was accumulated over every omission_year in
# df_inventor_om, so a total from a year that has no row in this panel only
# shows up from the inventor's next matched year onward - confirm that no such
# omission years exist in the data.
df_inv_yr["cumsum_omission"] = (
    df_inv_yr.groupby("inventor_id")["cumsum_omission"].ffill().fillna(0)
)

# Clean up datatypes: convert to integers, then shrink each column to the
# smallest integer dtype that can hold its actual values. Fixed int8/int16
# casts would wrap around silently if a yearly count exceeded 127 or a
# cumulative count exceeded 32767.
for col in ("year", "min_year", "omission", "cumsum_omission"):
    df_inv_yr[col] = pd.to_numeric(df_inv_yr[col].astype("int64"), downcast="integer")

# Write the inventor-year panel without the row index.
df_inv_yr.to_csv(dir + "cleandata/inventor_cum_omission.csv", index=False)